logo

Setup and Data

# Load Required Packages
require(pacman)
p_load(tidyverse, corrplot, MASS, extrafont, ggforce, ggtext, cowplot, pdftools, showtext, brant, DT, nnet, foreign, reactable, plotly)
extrafont::loadfonts(device="all")

# Set Working Directory
dir <- "/Users/treypallace/Documents/Data Analyst Portfolio/awards_project/"

# Read in the Data
awards <- read_csv(paste0(dir,"awards_data.csv"))
player_data <- read_csv(paste0(dir,"player_stats.csv"))
team_data <- read_csv(paste0(dir,"team_stats.csv"))
rebounding_data <- read_csv(paste0(dir,"team_rebounding_data_22.csv"))

Part 1 – Awards

1. Average number of points per game for players in the 2007-2021 seasons who won All NBA First, Second, and Third teams (not the All Defensive Teams), as well as for players who were in the All-Star Game (not the rookie all-star game)?

1) Data Cleaning

# Remove duplicates from awards and player_data
unique_awards <- awards %>% distinct(nbapersonid, season, .keep_all = TRUE)
unique_player_data <- player_data %>% distinct(nbapersonid, season, team, .keep_all = TRUE)

#Join player data and award data 
merged<-  left_join(unique_player_data, unique_awards, by = c("season", "nbapersonid"))

# Replace spaces in column names with '_'
colnames(merged) <- gsub(" ", "_", colnames(merged))

2) Calculate average number of points per game by dividing total points/total games

#  Function to calculate average points per game
calculate_average_ppg <- function(data_frame, col_names) {
  
  # Filters data on columns we want to calculate 
  filtered_data <- data_frame %>%
    filter(col_names == 1)

  # Sum points made
  total_points <- sum(filtered_data$points)
  
  # Sums games played
  total_games <- sum(filtered_data$games)

  if (total_games > 0) { # Check to make sure there is no division by 0
    # Calculate average by dividing total points by total games
    avg_ppg <- total_points / total_games
  } else {
    avg_ppg <- 0
  }

  return(avg_ppg)
}

# Apply function to specified columns
columns_to_apply<-c("All_NBA_First_Team", "All_NBA_Second_Team", "All_NBA_Third_Team", "all_star_game")

lapply(merged[columns_to_apply], function(col) {
  calculate_average_ppg(merged, col)
})
## $All_NBA_First_Team
## [1] 25.84562
## 
## $All_NBA_Second_Team
## [1] 22.89029
## 
## $All_NBA_Third_Team
## [1] 20.45424
## 
## $all_star_game
## [1] 21.68399

ANSWER 1:

1st Team: 25.8 points per game
2nd Team: 22.9 points per game
3rd Team: 20.5 points per game
All-Star: 21.7 points per game

2. average number of years of experience in the league it takes for players to make their first All NBA Selection (1st, 2nd, or 3rd team)? Sample will bel imited to players drafted in 2007 or later who did eventually go on to win at least one All NBA selection. For example:

  • Luka Doncic is in the dataset as 2 years. He was drafted in 2018 and won his first All NBA award in 2019 (which was his second season).
  • LeBron James is not in this dataset, as he was drafted prior to 2007.
  • Lu Dort is not in this dataset, as he has not received any All NBA honors.

1) Filter data to limit the sample to players drafted in 2007 or later who did eventually go on to win at least one All NBA selection

# Filter data 
filtered2<-merged %>%
  filter((draftyear >= 2007) & ( (All_NBA_First_Team ==1) | (All_NBA_Second_Team==1) | (All_NBA_Third_Team==1)))

2) Calcuate the average number of years of experience in the league it takes for players to make their first All NBA Selection

# Obtain the number of years between the year the player was drafted and each All NBA selection by subtracting 
# the year the award was received from the year the player was drafted and adding 1 
filtered2$experience<-filtered2$season - filtered2$draftyear +1

# Choose the --minimum-- value to get the years of experience for players to make their --first-- selection 
min_experience <- filtered2 %>%
  group_by(nbapersonid) %>%
  summarize(min_experience = min(experience, na.rm = TRUE))

# Calculate the average years of experiences needed to make their first All NBA selection
mean(min_experience$min_experience)
## [1] 4.682927

ANSWER 2:

4.68 Years

Build Dataset to Predict Career Outcomes

Data Cleaning

Data Cleaning Steps

1) Select columns needed to determine season outcomes

data <- merged %>%
  dplyr::select(
    draftyear,
    nbapersonid,
    season,
    team,
    All_NBA_First_Team,
    All_NBA_Second_Team,
    All_NBA_Third_Team,
    Most_Valuable_Player_rk,
    Defensive_Player_Of_The_Year_rk,
    all_star_game,
    mins,
    games_start
  )

2) What about players that played on multiple teams during the same season?

We don’t want to double count players that played on multiple teams in one season. We want a dataframe that can be uniquely identified by season and nbapersonid. To combine rows of players who played on multiple teams in one season, I sum the total number of minutes played and the total number of games started for each player in a season. The awards are independent of the team.

summarize_stats( ): This helper function creates two new columns for the total minutes played (total_mins) and total games started (total_games_start) for each player during one season. It also creates a new teams column with the value “multiple” inserted for players who played on more than one team during one season

rejoin_awards( ): This helper function joins the new summarized stats with the awards data.

remove_duplicates( ):This helper function keeps distinct combinations of season and id since the join added duplicates

#------------------------------ Helper Functions---------------------------------
summarize_stats <- function(df) {
  df %>%
  group_by(nbapersonid, season) %>%
  summarize(
    teams = ifelse(n() > 1, "multiple", unique(team)),
    total_mins = sum(mins),
    total_games_start = sum(games_start)
  ) %>%
  ungroup()
}

rejoin_awards <- function(df){
  df %>%
    left_join(data, by = c("nbapersonid", "season")) %>% 
    dplyr::select(-team, -mins, -games_start) # Remove previous columns
}


remove_duplicates<- function(df){
  df %>%
    distinct(nbapersonid, season, .keep_all = TRUE) 
}

#------------------------------ Main Functions---------------------------------
handle_multiple_teams <- function() {
  df_summary <-summarize_stats(data)
  
  joined <-rejoin_awards(df_summary)
  
  remove_duplicates(joined)
}


df3 <- handle_multiple_teams()

3) Expand dataframe to create rows for each player from their rookie year to 2021

We want a row for each player-season combination from their rookie year up to 2021. This allows their season outcome to be classified as “Out of League” for any year after they leave the NBA.

expand_df( ): This function first creates a dataframe with a row for every possible player-season combination (each player for each season 2007-2021). Since we don’t want to consider player-season combinations from before that player joined the league, it then removes any rows for seasons before their rookie season.

expand_df <-function() {
  # Create a sequence of seasons as numeric values from the minimum season to 2021
  season_range <- seq(2007,2021,1)
  
  # Create a data frame with all combinations of nbapersonid and season
  expanded_df <- expand.grid(nbapersonid = unique(df3$nbapersonid), season = season_range)

  # Merge the expanded_df with df3 to fill in missing values with NA
  all_years<- expanded_df %>%
    left_join(df3, by = c("nbapersonid", "season"))
  
  # Find each player's rookie year (or 2007 if they were drafted before 2007)
  rookie_seasons <- df3 %>%
    group_by(nbapersonid) %>%
    summarize(rookie_season = min(season))
  
  # Add rookie year column
  all_years_rookie <- all_years %>%
    left_join(rookie_seasons, by = "nbapersonid")
  
  # Only keep rows from rookie season - 2021
  all_years_rookie %>%
    filter(season >= rookie_season)

}

expanded_df <-expand_df()

4) Season Outcome Function

Write a function that defines the season outcome for each player’s season.

is_starter( ): This helper function determines if the season outcome is qualified as “Starter”. It takes into account the adjustments needed for 2011, 2019, and 2020. A player is a “Starter” if he started in at least 41 games in the season OR if he played at least 2000 minutes in the season.

is_rotation( ): This helper function determines if the season outcome is qualified as “Rotation”. It takes into account the adjustments needed for 2011, 2019, and 2020. A player is a “Rotation” player in a season if he played at least 1000 minutes in the season.

get_season_outcome( ): This function determines the season outcome for every player-season combination. It checks the qualifications for “Elite”, “All-Star”, “Starter”, “Rotation”, “Roster”, in that order. If none of the criteria is met, the player is qualified as “Out of League”.

#--------------------------- Helper functions-----------------------------------
is_starter <- function(season, mins, games_start) {
  ifelse(
    season == 2011,
    (mins * 82/66 >= 2000) | (games_start * 82/66 >= 41),
    ifelse(
      season %in% c(2019, 2020),
      (mins * 82/72 >= 1000) | (games_start * 82/72 >= 41),
      (mins >= 2000) | games_start >= 41
    )
  )
}

is_rotation <- function(season, mins) {
  ifelse(
    season == 2011,
    mins * 82/66 >= 1000,
    ifelse(
      season %in% c(2019, 2020),
      mins * 82/72 >= 1000,
      mins >= 1000
    )
  )
}


#--------------------------- Main function-----------------------------------
get_season_outcome <-function(df){
  df %>% 
    mutate(
    season_outcome = case_when(
      All_NBA_First_Team == 1 |
      All_NBA_Second_Team == 1 |
      All_NBA_Third_Team == 1 |
      Most_Valuable_Player_rk == 1 |
      Defensive_Player_Of_The_Year_rk == 1 ~ "Elite",
        
      all_star_game == 1 ~ "All-Star",
        
      is_starter(season, total_mins, total_games_start) ~ "Starter",
        
      is_rotation(season, total_mins) ~ "Rotation",
        
      total_mins >= 1 ~ "Roster",
        
      TRUE
      ~ "Out of League"
    )
  )
}

season_outcomes <- get_season_outcome(expanded_df)

5) Career Outcome Function

Write a function that defines the career outcome for each player.

list_season_outcomes( ): This helper function creates a new column containing a list of every season outcome of the player.

get_second_to_last( ): This helper function first keeps the all the season outcomes for four years after their rookie season. This is not simply removing the first four season outcomes since 2007 because some players were drafted before 2007. For example, a player drafter in 2004, could have season outcomes for 2004-2009. Therefore, this function keeps season outcomes from 2008-2009. Then the function checks to see if there are at least two remaining season outcomes in the data since we need to know the outcome of at least two seasons after his first four seasons to determine a career outcome. If there are not at least two season outcomes, they are classified as “Out-of-League”. Then the list is then ranked from lowest rank (“Out of League” = 1) to highest rank (“Elite” = 6). Finally, it returns the second to last value in the ordered list of season outcomes for each player. We want the second to last outcome since a “career outcome” represents the highest level of success that the player achieved for at least two seasons.

(Note this function cannot factor in historical data before 2007. For example, say a player was drafted in 1995 and had two Elite season outcomes from 2000-2002. However, if after 2007 they only earned season outcomes of All-Star until they left the league in 2010, their career outcome would be classified as “All-Star”. Even though they had at least two seasons after their first 4 as Elite, but this is not in the data.)

get_career_outcome( ): This function defines a career outcome for each player.

# Global Variable
rank_order <-c("Out of League", "Roster", "Rotation", "Starter", "All-Star", "Elite")

# -------------------------------- Helper functions ----------------------------
# Function to get distinct player statistic
get_player_stat <- function(data, target_var) {
  data %>%
    group_by(nbapersonid) %>%
    distinct({{ target_var }}, .keep_all = TRUE) %>%
    dplyr::select({{ target_var }})
}

draft_years <- get_player_stat(player_data, draftyear)

# Function to create the new a list of all season outcomes, ordered from lowest to highest
list_season_outcomes <- function(df) {
  df %>%
    group_by(nbapersonid) %>%
    summarize(ordered_season_outcome = list(factor(season_outcome, levels = rank_order))) %>%
    left_join(draft_years, by = "nbapersonid")
}


get_second_to_last <- function(lst, draftyear) {
  if (draftyear <= 2003) {
    lst <-lst  # No alteration if draftyear is <= 2003 since their season after their 1st four is 2007
  } else if (draftyear >= 2004 && draftyear <= 2006) {
    lst <- lst[-(1:(draftyear - 2003))]  # Remove seasons based on draft year (example if drafted in 2004, the season their first four seasons is 2008, so 2007 is removed)
  } else {
    lst <- lst[-(1:4)]  # Remove the first four seasons if drafted from 2007 and beyond
  }
  
  if (length(lst) >= 2) { #  Checks for players drafted in 2017 and after since they don't have enough season outcomes
    lst <- sort(lst)  # Sort the list in ascending order
    return(lst[length(lst) - 1])  # Returns the second to last element
  } else {
    return(1)  # If there are not at least two seasons after their rookie season, return 1 ("Out of League")
  }
}

#--------------------------------- Main Function --------------------------------

get_career_outcome <- function() {
  
  season_outcomes_clean<-season_outcomes %>%
  dplyr::select(nbapersonid, season, season_outcome, draftyear)
  
  season_outcomes_ranked <- list_season_outcomes(season_outcomes_clean)
  
  career_ranks <- season_outcomes_ranked %>%
    mutate(second_to_last = mapply(get_second_to_last, ordered_season_outcome, draftyear))
  
  
 career_ranks%>%
    mutate(career_outcome = rank_order[career_ranks$second_to_last])
}

career_outcomes <- get_career_outcome()
datatable(career_outcomes %>% dplyr::select(-second_to_last), 
          options = list(scrollX = '400px'))

3. How many players with 2010 listed as their draft years have a career outcome in each of the 6 buckets?

Find the value counts of each career outcome for players drafted in 2010.

# Join the career outcome data with  datadraft year
career_outcomes %>%
  filter(draftyear == 2010) %>%
  # Group by career_outcome and count the values
  group_by(career_outcome) %>%
  summarize(count = n())
## # A tibble: 6 × 2
##   career_outcome count
##   <chr>          <int>
## 1 All-Star           1
## 2 Elite              2
## 3 Out of League     43
## 4 Roster            10
## 5 Rotation           7
## 6 Starter           10

ANSWER 3:

Elite: 2 players.
All-Star: 1 players
Starter: 10 players.
Rotation: 7 players.
Roster: 10 players.
Out of League: 43 players.

Predictive Model

How does the Model Work: Mutlinomial Nominal Logistic Regression

The model is designed to assist in predicting NBA career outcomes for players based on their game performance statistics. The model considers NBA career outcomes on a scale of different categories, including as “Out of League,” “Roster,” “Rotation,” “Starter,” “All-Star,” and “Elite.” To train the model, historical data is used where the career outcomes of players are known, and this data is used to teach the model how different player statistics correspond to specific outcomes. Once the model has learned these patterns, it can then be applied to new or unseen data. To make predictions, our model uses a variety of player statistics. These statistics are drawn from a player’s performance throughout their career. Examples include points scored, assists, rebounds, and shooting percentages. This model can assist in assessing player potential, which can guide decisions related to contract extensions, draft picks, or trades. It helps in making more informed choices about building a competitive team. It’s important to note that the model is dynamic. As more data becomes available and player performance evolves, the model can be updated and refined to provide even more accurate predictions.

I first considered ordinal logistic regression. Ordinal logistic regression is typically used when you have a categorical outcome variable that has more than two levels. Specifically, ordinal logistic regression is used when there is a natural ordering to your outcome variable, and the distance between two outcomes is unknown. It is reasonable that an Elite outcome is superior to All-Star, and that All-Star is superior to Starter, but just how much “better” is unclear. I switched instead to a multinomial model. A multinominal model is more flexible as it has least restrictive assumptions, and it also gives you coefficients for every level. An ordinal logit would likely only fit better if the proportional odds assumption is met or really close to it. (Please see the appendix for the statistical tests and justification for not using the ordinal logistic regression model).

Preparing the Data

Calculate Summary Statistics by Season

# List of columns to sum
sum_columns <- c('All_NBA_Defensive_First_Team', 'All_NBA_Defensive_Second_Team', 'All_NBA_First_Team',
                  'All_NBA_Second_Team', 'All_NBA_Third_Team', 'Bill_Russell_NBA_Finals_MVP',
                  'Player_Of_The_Month', 'Player_Of_The_Week', 'Rookie_Of_The_Month', 'all_star_game',
                  'rookie_all_star_game')

# List of columns to average
avg_columns <- c('allstar_rk', 'Defensive_Player_Of_The_Year_rk', 'Most_Improved_Player_rk',
                  'Most_Valuable_Player_rk', 'Rookie_Of_The_Year_rk', 'Sixth_Man_Of_The_Year_rk',
                  'all_nba_points_rk', 'all_rookie_points_rk', 'games', 'games_start', 'mins', 'fgm',
                  'fga', 'fgp', 'fgm3', 'fga3', 'fgp3', 'fgm', 'fga2', 'fgp2', 'efg', 'ftm', 'fta', 'ftp',
                  'off_reb', 'def_reb', 'tot_reb', 'ast', 'steals', 'blocks', 'tov', 'tot_fouls', 'points',
                  'PER', 'FTr', 'off_reb_pct', 'def_reb_pct', 'tot_reb_pct', 'ast_pct', 'stl_pct', 'blk_pct',
                  'tov_pct', 'usg', 'OWS', 'DWS', 'WS', 'OBPM', 'DBPM', 'BPM', 'VORP')

# ---------------------------- Helper Functions --------------------------------
# Function to Summarize Data
summarize_dat<-function(df, minyear, maxyear) {
  df %>%
    filter((draftyear>=minyear) & (draftyear<=maxyear))%>%
    group_by(nbapersonid) %>%
    summarize(across(all_of(sum_columns), sum, .names = "{.col}_sum"),
            across(all_of(avg_columns), mean, .names = "{.col}_avg"))
}

# ------------------------------------------------------------------------------

# Summarize Data
sum_data<-summarize_dat(merged, 1991, 2015)

# Get draft pick for each player
draftpicks <- get_player_stat(player_data, draftpick)


# Add draftyear, draftpick, career_outcome to summarized 
add_columns <- data.frame(career_outcomes$nbapersonid, career_outcomes$draftyear, draftpicks$draftpick, career_outcomes$second_to_last, career_outcomes$career_outcome)

# Clean column names
colnames(add_columns) <- sub("^[^.]+\\.", "", colnames(add_columns)) 
final_data<- left_join(sum_data, add_columns, by = "nbapersonid")
final_data <- final_data %>%
  dplyr::rename(career_outcome_rank = second_to_last) %>%
  dplyr::select_if(~ !all(. == 0)) %>% # remove all 0 columns
  dplyr::select(-Most_Valuable_Player_rk_avg, -Rookie_Of_The_Year_rk_avg) # only had 1 non 0 value so removed

final_data[is.na(final_data)] <- 0 # replace any NAs with 0

Variable Screening

Explore which Variables Have Correlations with Career Outcome

# Choose numeric columns 
numeric_columns <- sapply(final_data, is.numeric)

# Extract the names of numeric columns
numeric_column_names <- names(final_data)[numeric_columns]

# Remove nbapersonid
numeric_column_names<-numeric_column_names[-1]

# Subset the data to only include numeric data
num_data <- final_data %>%
  dplyr::select(all_of(numeric_column_names)) 


# Calculate Correlations  
correlations <- cor(num_data, use = "pairwise.complete.obs") 
cor_with_career_outcome <- correlations[,"career_outcome_rank"]
correlation_data <- data.frame(Variable = names(cor_with_career_outcome), Correlation = cor_with_career_outcome)

Visualize the Correlation Data

p1 <- ggplot(correlation_data, aes(x = reorder(Variable, Correlation), y = Correlation,
                                   text = paste("Variable:", Variable, "<br>Correlation:", Correlation))) +
  geom_bar(stat = "identity", aes(fill = Correlation > 0.30)) +
  geom_hline(yintercept = 0.3, linetype = "dashed", color = "#f6ab24") +  
  labs(y = "Variable", x = "Correlation") +
  ggtitle("Correlation with Career Outcome Rank") +
  scale_y_continuous(limits = c(-0.25, 1), breaks = seq(-0.25, 1, 0.25)) +
  theme_minimal() +
  theme(text = element_text(family="CMU Sans Serif", color = 'black'),
        axis.text.y = element_text(hjust = 1, size = 10)) +  
  coord_flip() +
  scale_fill_manual(values = c("#e62428", "#1067b4"))
# Convert ggplot to plotly
ggplotly(p1, tooltip = "text")

Variables with a correlation greater than 0.3 moved forward in the variable selection process.

Check For Multicollinearity

correlated_vars<-c(rownames(correlation_data %>%
  filter(abs(Correlation)>0.3)))

check_multi_col<-final_data %>%
  dplyr::select(all_of(correlated_vars))%>%
  dplyr::select(-career_outcome_rank)

library(corrplot)

corr<-cor(check_multi_col,use="pairwise.complete.obs")

high_corr_combinations <- list()

# Get the number of variables
num_vars <- ncol(corr)

# Loop through all possible pairs of variables
for (i in 1:(num_vars - 1)) {
  for (j in (i + 1):num_vars) {
    if (!is.na(corr[i, j]) && abs(corr[i, j]) > 0.7) {
      high_corr_combinations[[length(high_corr_combinations) + 1]] <- c(rownames(corr)[i], rownames(corr)[j], corr[i,j])
    }
  }
}

Visualize the Correlation Matrix

par(family="CMU Sans Serif")
corrplot(abs(corr),method="color",
         order = 'AOE', 
         type = 'lower',
         col=colorRampPalette(c("#e62428","white","#1067b4"))(100),cl.lim=c(0,1),
         tl.col = "black", 
         tl.cex = 0.9)

Variable Selection

To avoid multicollinearity, the following variable were removed from the model: VORP_avg, OWS_avg, DWS_avg, WS_avg, tot_fouls_avg, off_reb_avg, tot_reb_avg ,ftm_avg, fta_avg, fgm2_avg, fga_avg, fgm3_avg, BPM_avg, fga2_avg, tov_avg,and PER_avg

Fit the Model

# Set a reference Level
final_data$career_outcome <- relevel(factor(final_data$career_outcome, levels = rank_order), ref = "Out of League")

# Build the model
multinom <- multinom(career_outcome ~ games_avg + games_start_avg + mins_avg + fgm_avg + fga3_avg + def_reb_avg +ast_avg + steals_avg + blocks_avg + points_avg  + OBPM_avg , data = final_data)
## # weights:  78 (60 variable)
## initial  value 2003.187087 
## iter  10 value 1626.265092
## iter  20 value 1389.445646
## iter  30 value 1310.513004
## iter  40 value 1286.330318
## iter  50 value 1106.111217
## iter  60 value 968.547469
## iter  70 value 873.267285
## iter  80 value 870.030917
## iter  90 value 869.786560
## iter 100 value 869.783377
## final  value 869.783377 
## stopped after 100 iterations

Multinomial Model Performance

# Run an empty model with no predictors
OIM <- multinom(career_outcome ~ 1, data = final_data)
## # weights:  12 (5 variable)
## initial  value 2003.187087 
## iter  10 value 1619.021319
## final  value 1619.021255 
## converged
anova(OIM,multinom)
## Likelihood ratio tests of Multinomial Models
## 
## Response: career_outcome
##                                                                                                                                   Model
## 1                                                                                                                                     1
## 2 games_avg + games_start_avg + mins_avg + fgm_avg + fga3_avg + def_reb_avg + ast_avg + steals_avg + blocks_avg + points_avg + OBPM_avg
##   Resid. df Resid. Dev   Test    Df LR stat. Pr(Chi)
## 1      5585   3238.043                              
## 2      5530   1739.567 1 vs 2    55 1498.476       0

The chi-square test analyzes the decrease in unexplained variance from the baseline model (3238.043) to the final model (1739.567), which is a difference of 1543.47. This change is significant, which means that our final model explains a significant amount of the original variability. The likelihood ratio with a p-value = 0 tells us that the model as a whole fits significantly better than an empty (i.e., a model with no predictors).

Strengths and Weaknesses of the Model

Strengths

Consideration of Many Variables: The model takes into account a wide range of player statistics, allowing it to capture nuanced and multifaceted aspects of player performance. This consideration of many variables enhances the model’s ability to make accurate predictions.

Sample Size: The dataset includes a substantial sample size of 1,118 NBA players. This large sample size provides statistical robustness and increases confidence in the model’s predictions.

Model Performance: Based on the chi-square test and likelihood ratio test, the model explains a significant amount of variability. In other words, the model can provide a reasonable representation of how the data points are spread out and how they change or vary with respect to the different variables.

Interpretability: Multinomial logistic regression provides clear parameter estimates, simplifying the identification of influential player statistics in career outcomes. It’s easier to interpret than more complex methods, like deep learning or random forests, making it an accessible tool for informed decision-making in sports analytics.

Easy Model Updates: The model can be easily updatable with new data. In dynamic environments like professional sports, player performance data is continually evolving. The model can be retrained with the latest data to adapt to changing trends and player dynamics. This flexibility allows sports analysts and decision-makers to stay current and make real-time adjustments, ensuring that the model remains relevant and effective over time.

Weaknesses

Sample Balance: There are more players in the dataset having “Out of League” or “Roster” outcomes compared to rarer outcomes like “Elite”.There’s a higher chance that the model will perform well on the majority class but poorly on the minority class i.e. it might predict “Rotation” or “Starter” more accurately but struggle with “Elite” players.

Assumptions: Multinomial logistic regression assumes that the relationships between predictors and outcomes are linear. If the relationships are highly nonlinear, the model’s accuracy may suffer.

Addressing Weaknesses with More Time and Data

Data Augmentation: Expanding the dataset by collecting more player statistics over time can help increase the sample size, especially for rare career outcomes. This enhances the model’s ability to predict less common scenarios like “Elite.”

Incorporate External Factors: Consider incorporating external factors like team performance, marketability, and injury history into the model to account for additional influences on career outcomes.

Ensemble Methods: Combine multiple models, each trained on different subsets of the data or with different parameters, to improve overall prediction performance.

Advanced Modeling Techniques: Exploring more advanced machine learning techniques, such as random forests or gradient boosting, may capture complex relationships in the data that multinomial logistic regression cannot. These methods are more flexible and also less sensitive to class imbalance.

A random forest model exammple:

Data Splitting:

  • Divide your the data into two subsets: a training set and a testing set. The training set will be used to train the Random Forest model, while the testing set will be used to evaluate its performance.

Model Training:

  • Train a Random Forest classifier on your training data.
# library(randomForest)
# 
# # 'data' is the training dataset
# model <- randomForest(career_outcome ~ ., data = data)

Hyperparameter Tuning:

  • Random Forest has hyperparameters that you can tune to optimize model performance. Common hyperparameters include the number of trees (ntree), the number of features to consider at each split (mtry), and the maximum depth of each tree (max.depth).

  • I would use techniques like cross-validation and grid search to find the best combination of hyperparameters.

# Example of hyperparameter tuning
#tuneRF(data, tunecontrol = randomForest::tuneControl(N = 10))

Feature Importance:

  • Random Forest can provide feature importance scores. These scores indicate the contribution of each player statistic to the prediction of career outcomes.
# # Get feature importance scores
# importance <- importance(model)

Predictions for Players Drafted in 2018-2021

# Filter Data to players drafter in 2018 and after and compute summary statistics
predict_players_data <- summarize_dat(merged, 2018, 2021)

# Replaces NAs with 0s
predict_players_data[is.na(predict_players_data)] <- 0

colnames<-c( "nbapersonid", "Out of League" , "Roster" ,  "Rotation" ,   "Starter",  "All-Star", "Elite")

# Make the Prediction
pred_2018<-cbind(predict_players_data,
      predict(multinom, predict_players_data, type="p"))  %>%
  dplyr::select(all_of(colnames))

datatable(pred_2018, options = list(scrollX = '400px'))

Predictions for Shai Gilgeous-Alexander, Zion Williamson, James Wiseman, and Josh Giddey.

# Filter Data to just these 4 players
predict_players<-player_data %>%
  filter(player %in%c( 'Shai Gilgeous-Alexander', 'Zion Williamson', 'James Wiseman', 'Josh Giddey'))%>%
  dplyr::select(nbapersonid, player)

# Summarize the Data
predict_players_data <-summarize_dat(merged, 1991, 2021)  %>%
  filter(nbapersonid %in% predict_players$nbapersonid)

predict_players_data[is.na(predict_players_data)] <- 0

colnames<-c( "nbapersonid", "Out of League" , "Roster" ,  "Rotation" ,   "Starter",  "All-Star", "Elite")

# Make the Prediction
final_predict<-cbind(predict_players_data,
      predict(multinom, predict_players_data, type="p"))  %>%
  dplyr::select(all_of(colnames))

Sankey Diagram for Results

# Convert Data to Long Format
final_predict_long <- pivot_longer(final_predict, 
                                   cols = -nbapersonid, 
                                   names_to = "career_outcome", 
                                   values_to = "value")

player_levels<-c('1628983','1629627' ,'1630164','1630581')

final_predict_long$nbapersonid <- factor(final_predict_long$nbapersonid, levels = player_levels)

# Adjust data to prepare for Sankey Diagram
prediction_results <- final_predict_long %>%
  gather_set_data(1:2) %>%
  mutate_at(vars(nbapersonid),
            funs(factor(., levels = player_levels))) %>%
  mutate_at(vars(career_outcome),
            funs(factor(., levels = rev(rank_order))))


font_add_google("Roboto Condensed", family = "roboto")
ggplot(prediction_results, aes(
  x = x,
  id = id,
  split = factor(
    y,
    levels = c(
      "1628983",
      "1629627",
      "1630164",
      "1630581",
      "Elite",
      "All-Star",
      "Starter",
      "Rotation",
      "Roster",
      "Out of League"
    )
  ),
  value = value
)) +
  geom_parallel_sets(
    aes(fill = nbapersonid),
    alpha = 0.8,
    axis.width = -0.01,
    n = 100,
    strength = 0.6
  ) +
  geom_parallel_sets_axes(fill = "grey20", axis.width = -0.01) +
  scale_color_manual(
    values = c(
      "1628983" = "#f6ab24",
      "1629627" = "#18224e",
      "1630164" = "#e62428",
      "1630581" = "#1067b4"
    )
  ) +
  scale_fill_manual(
    values = c(
      "1628983" = "#f6ab24",
      "1629627" = "#18224e",
      "1630164" = "#e62428",
      "1630581" = "#1067b4"
    )
  )+  
  annotate(
    geom = "text",
    x = 2.07,
    y = 4.2,
    label = "Elite",
    size = 6,
    family = "roboto"
  ) +  
  annotate(
    geom = "text",
    x = 2.1,
    y = 2.9,
    label = "All-Star",
    size = 6,
    family = "roboto"
  ) + 
  annotate(
    geom = "text",
    x = 2.1,
    y = 1.8,
    label = "Starter",
    size = 6,
    family = "roboto"
  ) +
    annotate(
    geom = "text",
    x = 2.105,
    y = 0.9,
    label = "Rotation",
    size = 6,
    family = "roboto"
  ) +
    annotate(
    geom = "text",
    x = 2.09,
    y = 0.4,
    label = "Roster",
    size = 6,
    family = "roboto"
  ) +
    annotate(
    geom = "text",
    x = 2.16,
    y = 0.05,
    label = "Out of League",
    size = 6,
    family = "roboto"
  ) +
    annotate(
    geom = "text", 
    x = 0.75,
    y = 4.1,
    label = "Shai Gilgeous-Alexander",
    size = 6,
    family = "roboto"
  ) +
    annotate(
    geom = "text",
    x = 0.8,
    y = 2.9,
    label = "Zion Williamson",
    size = 6,
    family = "roboto"
  ) +
    annotate(
    geom = "text",
    x = 0.8,
    y = 1.9,
    label = "James Wiseman",
    size = 6,
    family = "roboto"
  ) +
    annotate(
    geom = "text",
    x = 0.8,
    y = 0.6,
    label = "Josh Giddey",
    size = 6,
    family = "roboto"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.background = element_rect(fill = "white", color = "transparent"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text = element_blank(),
    axis.title = element_blank()
  ) +
  coord_cartesian(clip = "off") +
  theme(plot.margin = unit(c(2, 5, 2,5), "lines")) 

HTML Result Table for Players Drafted in 2019-2021

# Filter Data to players drafter in 2018 and after and compute summary statistics
player_names <- get_player_stat(player_data, player)

# Summarize Data and add pLayer names
predict_players_data <-summarize_dat(merged, 2019, 2021)  %>%
  left_join(player_names)

# Replaces NAs with 0s
predict_players_data[is.na(predict_players_data)] <- 0

colnames<-c( "nbapersonid", "player","Out of League" , "Roster" ,  "Rotation" ,   "Starter",  "All-Star", "Elite")

# Make the Prediction
pred_2019<-cbind(predict_players_data,
      predict(multinom, predict_players_data, type="p"))  %>%
  dplyr::select(all_of(colnames))

# Find the outcome  with the highest probability for each row
pred_2019$most_likely_outcome <- names(pred_2019[, rank_order])[apply(pred_2019[, rank_order], 1, which.max)]

reactable(pred_2019)

Part 2 – Predicting Team Stats

1. OKC’s predicted offensive rebound percent is for game 81 in the data. That is, use games 1-80 to predict game 81.

# Filter to include rows with OKC as the offensive team and games 1-80
okc <- rebounding_data %>%
  filter((team == "OKC" ) & (game_number %in% seq(1,80)))

# Find OKC's predicted offensive rebound percent is for game 81
sum(okc$offensive_rebounds)/sum(okc$off_rebound_chances) * 100
## [1] 28.86898

ANSWER 1:

28.9%

Changes tobetter account for missing players.

ANSWER 2:

  1. Player-Specific Impact Model:
    • Change: Create a model that estimates the offensive rebounding impact of individual players.
    • Implementation: For each player, calculate their Offensive Rebound Rate (ORR), which is the ratio of offensive rebounds to offensive rebound opportunities (missed shots while the player is on the court). Sum the ORR of all players expected to play in the next game.
  2. Opponent Defensive Rebounding Model:
    • Change: Account for the opponent’s defensive rebounding ability.
    • Implementation: Calculate the opponent’s Defensive Rebound Rate (DRR) and consider it as a factor in the offensive rebounding prediction. For instance, you can use a weighted average of the team’s historical offensive rebounding performance and the opponent’s historical defensive rebounding performance.
  3. Real-time Updates Model:
    • Change: Implement a real-time update system for player availability.
    • Implementation: Develop a script or algorithm that continuously monitors injury reports and lineup announcements. If a key player is reported as unavailable, adjust the offensive rebounding prediction accordingly by modifying the player-specific ORR.
  4. Expert Insights Integration:
    • Change: Incorporate expert insights into the model.
    • Implementation: Create a weight or scaling factor that represents the confidence level in expert insights. Multiply this factor with the predictions from the model. For example, if experts indicate that the absence of a key player will significantly impact offensive rebounding, the weight can be higher.

Model Weaknesses

ANSWER 3:

  1. Non-Stationarity:
    • Weakness: The simple average model assumes that offensive rebounding percentages remain constant over time. However, team dynamics, player performance, and strategies can change during a season, leading to non-stationary data.
    • Solution: Implement time series analysis techniques to capture temporal trends and seasonality in offensive rebounding percentages. Techniques such as exponential smoothing or autoregressive integrated moving average (ARIMA) modeling can help account for changes in performance over time. Additionally, consider using rolling averages or weighted averages that give more weight to recent games to adapt to evolving team dynamics.
  2. Outliers and Anomalies:
    • Weakness: The model is sensitive to outliers and anomalies in the data, which can distort the average. Unexpected events or extreme performances in individual games may not be appropriately addressed.
    • Solution: Apply outlier detection methods to identify and handle extreme values in the data. For example, you can use statistical tests or machine learning algorithms to flag and exclude outliers from the averaging process. Alternatively, consider using robust statistics that are less influenced by outliers when calculating averages.
  3. Sample Size Variation:
    • Weakness: The model treats all games equally, regardless of variations in the number of offensive rebounding opportunities in each game.
    • Solution: Normalize offensive rebounding percentages by the number of offensive rebounding chances (missed shots) in each game. This accounts for differences in sample size and provides a more accurate measure of offensive rebounding efficiency. Additionally, you can consider weighting games based on their importance or significance in your analysis.

Appendix

Ordinal Logistic Regression Model

ologit <- polr(career_outcome ~ games_avg + games_start_avg + mins_avg + fgm_avg + fga3_avg + def_reb_avg +ast_avg + steals_avg + blocks_avg + points_avg  + OBPM_avg , data = final_data, Hess = T)

summary(ologit)
## Call:
## polr(formula = career_outcome ~ games_avg + games_start_avg + 
##     mins_avg + fgm_avg + fga3_avg + def_reb_avg + ast_avg + steals_avg + 
##     blocks_avg + points_avg + OBPM_avg, data = final_data, Hess = T)
## 
## Coefficients:
##                     Value Std. Error t value
## games_avg       -0.025276  0.0111345 -2.2700
## games_start_avg  0.041733  0.0117133  3.5629
## mins_avg         0.002662  0.0008278  3.2162
## fgm_avg         -0.024741  0.0070944 -3.4874
## fga3_avg        -0.003523  0.0015511 -2.2712
## def_reb_avg      0.007168  0.0019981  3.5874
## ast_avg          0.003833  0.0016099  2.3811
## steals_avg      -0.003366  0.0076215 -0.4416
## blocks_avg       0.008740  0.0058956  1.4825
## points_avg       0.011667  0.0028267  4.1276
## OBPM_avg         0.071420  0.0267750  2.6674
## 
## Intercepts:
##                      Value   Std. Error t value
## Out of League|Roster  1.6735  0.2070     8.0862
## Roster|Rotation       3.2073  0.2313    13.8669
## Rotation|Starter      4.7414  0.2598    18.2468
## Starter|All-Star     10.4976  0.4738    22.1557
## All-Star|Elite       12.2196  0.5500    22.2160
## 
## Residual Deviance: 1861.736 
## AIC: 1893.736

Evaluating the Model

Assumption of Proportional Odds

Brant Test assesses whether the observed deviations from our Ordinal Logistic Regression model are larger than what could be attributed to chance alone.

brant(ologit)
## ---------------------------------------------------- 
## Test for     X2  df  probability 
## ---------------------------------------------------- 
## Omnibus          241.15  44  0
## games_avg        13.16   4   0.01
## games_start_avg  17.86   4   0
## mins_avg     16.85   4   0
## fgm_avg          18.24   4   0
## fga3_avg     15.03   4   0
## def_reb_avg      9.22    4   0.06
## ast_avg          10.87   4   0.03
## steals_avg       11.92   4   0.02
## blocks_avg       4.51    4   0.34
## points_avg       24.94   4   0
## OBPM_avg     9.7 4   0.05
## ---------------------------------------------------- 
## 
## H0: Parallel Regression Assumption holds

Since the p-values for each variable, except blocks_avg, are less than any reasonable alpha level, we reject the null hypothesis that the parallel regression (proportional odds) assumption is met.

Goodness of Fit

The Lipsitz test is a goodness of fit test for ordinal response logistic regression models.

library(generalhoslem)
lipsitz.test(ologit)
## 
##  Lipsitz goodness of fit test for ordinal response models
## 
## data:  formula:  career_outcome ~ games_avg + games_start_avg + mins_avg + fgm_avg + formula:      fga3_avg + def_reb_avg + ast_avg + steals_avg + blocks_avg + formula:      points_avg + OBPM_avg
## LR statistic = 23.006, df = 9, p-value = 0.006183

Since the p-value of 0.006 is less than any reasonable alpha level, we reject the null hypothesis, indicating a poor correlation between predicted and observed values.